/**
 * AbstractWekaCompleteCompositionVector.java
 *
 * Created on Jul 25, 2008, 10:31:48 AM
 *
 * 
 * Helpful hints about weka:
 *  - Building a classifier from the command line uses the Evaluation Class, 
 *    which contains most of the options.
 * 
 * $Id: AbstractWekaCompleteCompositionVector.java 9 2012-03-09 20:51:18Z enzo69mc $
 */

package org.mitre.ccv.weka;

import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.ObjectInputStream;
import java.io.ObjectOutputStream;
import java.io.OutputStream;

import java.util.Enumeration;
import java.util.LinkedList;
import java.util.Random;
import java.util.TreeSet;
import java.util.zip.GZIPInputStream;
import java.util.zip.GZIPOutputStream;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.mitre.ccv.CompleteCompositionVector;

import weka.classifiers.Classifier;
import weka.classifiers.Evaluation;
import weka.core.Attribute;
import weka.core.FastVector;
import weka.core.Instance;
import weka.core.Instances;
import weka.core.SparseInstance;
import weka.core.xml.KOML;

/**
 * Abstract class for classifying Complete Composition Vectors using Weka.
 *
 * <p>Supporting classes should be able to learn from the vector features (data set) and classify new instances (as raw sequences).
 * They should also provide the following <tt>constructors</tt>:
 * <ul>
 * <li>Constructor(JSONObject jsonDataSet) - for loading in a data set from a JSON object (as generated by {@link org.mitre.ccv.CompleteCompositionVectorMain})</li>
 * <li>Constructor(String inputFile)    - file name for loading in a saved object (serialized by {@link #saveModelObject(String)})</li>
 * <li>Constructor(ObjectInputStream ois) - stream for loading in a saved object (serialized by {@link #saveModelObject(String)})</li>
 * </ul>
 *
 * @author Marc Colosimo
 * @see <a href="http://www.cs.waikato.ac.nz/ml/weka/">Weka</a>
 */
public abstract class AbstractWekaCompleteCompositionVector {

    /** Commons-logging logger shared by all instances of this class. */
    private static final Log LOG = LogFactory.getLog(AbstractWekaCompleteCompositionVector.class);
    /*
     * TODO: implement supervised feature selection using Weka.
     *
     * Example:
     * java weka.attributeSelection.WrapperSubsetEval -I iris.arff
     * -B weka.classifiers.trees.J48
     * -S "weka.attributeSelection.GeneticSearch -Z 20 -G 10" -- -C 0.2
     *
     * (Deliberately a plain block comment: written as a Javadoc comment it
     * would be attached to the field declared below.)
     */
   
    /**
     * Attribute vector; stays <code>null</code> until it is either set via
     * {@link #setAttributes(FastVector)} or lazily derived from the instances
     * by {@link #getAttributes()}.
     */
    private FastVector attributes = null;
    
    /**
     * Loads a saved classifier (only as binary for now) from a file.
     * <P>
     * This supports compressed files (GZIP): a file name ending in
     * <code>.gz</code> is decompressed while reading.
     * <P>
     * The underlying file is closed before this method returns, whether or
     * not loading succeeded.
     *
     * @param objectInputFileName file path
     * @throws java.io.FileNotFoundException if the file cannot be opened
     * @throws java.io.IOException if the file cannot be read or is not a valid object stream
     * @throws java.lang.ClassNotFoundException if the class of a serialized object cannot be found
     * @see #loadModelObject(ObjectInputStream)
     */
    public void loadModelObject(String objectInputFileName)
            throws FileNotFoundException, IOException, ClassNotFoundException
    {
        InputStream is = new FileInputStream(objectInputFileName);
        try {
            if (objectInputFileName.endsWith(".gz")) {
                is = new GZIPInputStream(is);
            }
            this.loadModelObject(new ObjectInputStream(is));
        } finally {
            // Closes the outermost stream created so far. This also covers the
            // cases where a wrapping constructor or the delegate threw before
            // anything else had a chance to close the file.
            is.close();
        }
    }

    /**
     * Loads a saved classifier (only as binary for now) from an object stream.
     *
     * <p>The objects are read in the order in which
     * {@link #saveModelObject(String)} writes them: classifier, attributes,
     * begin, end and class index. Everything is read before any setter is
     * called, so a truncated or corrupt stream does not leave this object
     * half-updated. The stream is always closed, even when reading fails.
     *
     * <p><b>NOTE:</b> this uses Java native deserialization; only load model
     * files that come from a trusted source.
     *
     * @param ois the stream to read from (closed by this method)
     * @throws IllegalArgumentException if <code>ois</code> is <code>null</code>
     * @throws java.io.IOException if the stream cannot be read
     * @throws java.lang.ClassNotFoundException if the class of a serialized object cannot be found
     */
    public void loadModelObject(ObjectInputStream ois) throws IOException, ClassNotFoundException {
        if (ois == null) {
            throw new IllegalArgumentException("ObjectInputStream is null!");
        }

        Classifier classifier;
        FastVector vectorAttributes;
        Integer begin;
        Integer end;
        Integer classIdx;
        try {
            classifier = (Classifier) ois.readObject();
            vectorAttributes = (FastVector) ois.readObject();
            begin = (Integer) ois.readObject();
            end = (Integer) ois.readObject();
            classIdx = (Integer) ois.readObject();
        } finally {
            ois.close();
        }

        this.setBegin(begin);
        this.setEnd(end);
        this.setClassifier(classifier);
        this.setAttributes(vectorAttributes);
        if (LOG.isDebugEnabled()) {
            // saveModelObject may have written a null attribute vector.
            int attributeCount = (vectorAttributes == null) ? 0 : vectorAttributes.size();
            LOG.debug(String.format("Loaded attributes %d and start %d, end %d", attributeCount, this.getBegin(), this.getEnd()));
        }
        this.setClassIndex(classIdx);
    }
    
    /**
     * Save the classifier and parameters as a binary object.
     *
     * <p>The following objects are written, in this order: classifier,
     * attributes, begin, end and class index (the order expected by
     * {@link #loadModelObject(ObjectInputStream)}). A file name ending in
     * <code>.gz</code> is GZIP compressed.
     *
     * <p>XML (<code>.xml</code>) and KOML (<code>.koml</code>, when KOML is
     * present) output is not supported. The request is rejected before the
     * output file is opened, so an existing file is left untouched. When KOML
     * is not present a <code>.koml</code> file is written as binary.
     *
     * <p>An empty file name is silently ignored.
     *
     * @see weka.classifiers.Evaluation
     * @param objectOutputFileName path of the file to write
     * @throws java.io.FileNotFoundException if the file cannot be created
     * @throws java.io.IOException if writing fails
     * @throws java.lang.Exception if an unsupported (XML/KOML) format is requested
     */
    public void saveModelObject(String objectOutputFileName)
            throws FileNotFoundException, IOException, Exception
    {
        Classifier classifier = this.getClassifier();
        if (objectOutputFileName.length() == 0) {
            return;
        }

        boolean xmlRequested = objectOutputFileName.endsWith(".xml")
                || (objectOutputFileName.endsWith(".koml") && KOML.isPresent());
        if (xmlRequested) {
            // Checked before touching the file system: opening the stream
            // first would truncate an existing file and leak the handle.
            LOG.fatal("Unable to save as KOML/XML");
            throw new Exception("Unsupported output format");
        }

        OutputStream os = new FileOutputStream(objectOutputFileName);
        try {
            if (objectOutputFileName.endsWith(".gz")) {
                os = new GZIPOutputStream(os);
            }
            ObjectOutputStream oss = new ObjectOutputStream(os);
            // From here on close the outermost wrapper so that buffered and
            // compressed data is completely written out.
            os = oss;

            oss.writeObject(classifier);
            oss.writeObject(this.getAttributes());
            oss.writeObject(this.getBegin());
            oss.writeObject(this.getEnd());
            Integer idx = this.getClassIndex();
            if (idx == null || idx == -1) {
                LOG.warn("classIndex is null or -1!");
            }
            oss.writeObject(idx);
            oss.flush();
        } finally {
            os.close();
        }
    }
    
    /**
     * Performs a (stratified if class is nominal) cross-validation
     * for a classifier on a set of instances and prints the summary and the
     * per-class details to <code>System.out</code>.
     *
     * <P> The classifier is <B>NOT</B> supposed to be trained when handed over to
     * this method.
     * <P> <b>Why?</b> If the classifier does not abide to the Weka conventions, that a
     * classifier has to be re-initialized every time the buildClassifier
     * method is called (in other words: subsequent calls to
     * {@link #buildClassifier()} will always return the same results), you will get
     * inconsistent and worthless results.
     * <P> The folds are drawn with a fixed random seed (<code>new Random(1)</code>).
     *
     * @param classifier the (untrained) classifier to evaluate
     * @param data       the data set to cross-validate on
     * @param numFolds   the number of folds
     * @throws RuntimeException wrapping any exception raised during the evaluation
     * @see <a href="http://weka.sourceforge.net/wiki/index.php/Use_Weka_in_your_Java_code#Evaluating">Use Weka in your Java code - Evaluating</a>
     */
    public void crossValidateModel(Classifier classifier, Instances data, int numFolds) {
        try {
            Evaluation eval = new Evaluation(data);
            eval.crossValidateModel(classifier, data, numFolds, new Random(1));
            System.out.println(eval.toSummaryString("\nResults\n=======\n", true));
            System.out.println(eval.toClassDetailsString());
        } catch (Exception ex) {
            // Pass the throwable separately so the stack trace is logged, and
            // keep it as the cause so callers can see what actually failed.
            LOG.fatal("Cross-validation failed", ex);
            throw new RuntimeException("Cross-validation failed", ex);
        }
    }
    
    /**
     * Run this classifier on the given unlabeled instance.
     *
     * <p><b>Note:</b> the given instance is modified in place: its class
     * value is set to the predicted label, and that same instance is the one
     * stored in the returned {@link LabeledInstance}.
     *
     * <p>If the classifier makes no prediction (the label is the missing
     * value, i.e. NaN), <code>clsString</code> of the result is
     * <code>null</code>. Casting the missing value to an index would
     * otherwise silently report the first class label.
     *
     * @param unlabeled     the unlabeled instances (such as that returned from {@link #getInstanceSparse})
     * @param classAtribute the class attribute used to translate the predicted index into its label
     * @return a LabeledInstance with the unlabeled instance labeled!
     * @throws IllegalArgumentException if <code>unlabeled</code> or <code>classAtribute</code> is <code>null</code>
     * @throws IllegalStateException if no classifier is available
     * @throws java.lang.Exception for various reasons (Weka)
     */
    public LabeledInstance runClassifier(Instance unlabeled, Attribute classAtribute)
            throws Exception
    {
        if (unlabeled == null) {
            throw new IllegalArgumentException("Unlabeled Instance is null!");
        }
        if (classAtribute == null) {
            throw new IllegalArgumentException("Class Attribute is null!");
        }

        Classifier classifier = this.getClassifier();
        if (classifier == null) {
            throw new IllegalStateException("No classifier has been set!");
        }

        double clsLabel = classifier.classifyInstance(unlabeled);
        unlabeled.setClassValue(clsLabel);

        LabeledInstance li = new LabeledInstance(unlabeled);
        li.clsDist = classifier.distributionForInstance(unlabeled);
        // (int) NaN is 0 in Java, so a missing prediction must not be looked up.
        li.clsString = Double.isNaN(clsLabel) ? null : classAtribute.value((int) clsLabel);
        return li;
    }
    
    /**
     * Classify using the given {@link Classifier} and unlabeled {@link Instances}.
     *
     * <p>A copy of the data set is made and every instance of that copy is
     * labeled with the classifier's prediction. If the data set has no class
     * index, the last attribute of the copy is used as the class (a warning
     * is logged).
     *
     * @param unlabeled     unlabeled instances containing a {@link Instances#classIndex}
     * @param classifier    the (trained) classifier to use
     * @return labeled instances leaving the original instances unlabeled.
     * @throws java.lang.Exception if the classifier fails on an instance
     */
    public Instances runClassifier(Instances unlabeled, Classifier classifier)
            throws Exception {

        // create copy
        Instances labeled = new Instances(unlabeled);
        if (labeled.classIndex() < 0) {
            LOG.warn("classIndex not set. Using last attribute for class!");
            labeled.setClassIndex(labeled.numAttributes() - 1);
        }
        Attribute classAttribute = labeled.classAttribute();

        // Every instance is labeled. The class index is an attribute index and
        // has nothing to do with the instance index, so none may be skipped.
        for (int i = 0; i < labeled.numInstances(); i++) {
            // Classify the copy: its data set is guaranteed to have a class
            // index, which is not the case for the original when the fallback
            // above was taken.
            Instance current = labeled.instance(i);
            // clsLabel is the missing value (NaN) if no prediction is made
            double clsLabel = classifier.classifyInstance(current);
            current.setClassValue(clsLabel);

            // Prints out the "string" class name, not the index
            if (LOG.isDebugEnabled()) {
                String clsName = Double.isNaN(clsLabel) ? "?" : classAttribute.value((int) clsLabel);
                LOG.debug(String.format("%s -> %s", clsLabel, clsName));
            }
        }

        // need to track sample name and values
        return labeled;
    }
    
    /**
     * Convert a complete composition vector to a Weka sparse instance.
     * <P>
     * Only composition features (n-mers/n-grams) that match attributes in the given model will be stored.
     * <P>
     * The position of an n-mer in {@link #getNmers()} is used as its attribute
     * index, so that list has to be in attribute order. The list is walked
     * once with a running position instead of calling <code>indexOf</code>
     * for every n-mer, which was quadratic on a <code>LinkedList</code>.
     *
     * @param ccv the composition vector to convert
     * @return a sparse instance whose data set is a fresh, empty "CCV Sparse"
     *         data set built from {@link #getAttributes()}
     * @throws java.lang.Exception if a matching n-mer occurs more than once in
     *         the n-mer list or its position is not a valid attribute index
     */
    public Instance getInstanceSparse(CompleteCompositionVector ccv) throws Exception {
        /** Need our model's features/nmers in their order */
        LinkedList<String> nmers = this.getNmers();

        FastVector attrs = this.getAttributes();
        Instances dataset = new Instances("CCV Sparse", attrs, 2);
        dataset.setClass((Attribute) attrs.elementAt(this.getClassIndex()));

        final int numAttributes = dataset.numAttributes();
        double[] valueBuffer = new double[numAttributes];
        int[] indexBuffer = new int[numAttributes];
        int numValues = 0;

        TreeSet<String> nmerSet = ccv.getNmerSet();
        // n-mers already stored; a repeat would reuse an attribute index.
        TreeSet<String> stored = new TreeSet<String>();

        int position = -1;
        for (String key : nmers) {
            position++;
            if (!nmerSet.contains(key)) {
                continue;
            }

            if (!stored.add(key)) {
                throw new Exception("Indices have to be ordered for Instance: " + key);
            }
            // Checked before writing to the buffers, which have exactly
            // numAttributes slots.
            if (position >= numAttributes) {
                throw new Exception("AbstractWekaCompleteCompositionVector." +
                        "getInstanceSparse:Index out of bounds for Instance: "
                        + key);
            }
            indexBuffer[numValues] = position;

            /** Now get the value. */
            Double value = ccv.getPiValueforNmer(key);

            /** We don't check the type since we expect only one type - NUMERIC */
            valueBuffer[numValues] = value;

            numValues++;
        }

        if (LOG.isDebugEnabled()) {
            LOG.debug(String.format("CompleteCompositionVector '%s' has %d features that matched %d attributes out of %d",
                    ccv.getName(), nmerSet.size(), numValues, nmers.size()));
        }

        /** Trim the buffers to the number of values actually stored. */
        double[] tempValues = new double[numValues];
        int[] tempIndices = new int[numValues];
        System.arraycopy(valueBuffer, 0, tempValues, 0, numValues);
        System.arraycopy(indexBuffer, 0, tempIndices, 0, numValues);
        Instance inst = new SparseInstance(1, tempValues, tempIndices, numAttributes);
        inst.setDataset(dataset);

        return inst;
    }
    
    /**
     * Get the attributes.
     *
     * <p>The last one should be the class attribute. We can have attributes
     * without instances data, since we need these to build instances.
     *
     * <p>If no attributes have been set and instances are available, the
     * attributes are derived from the instances (all non-class attributes
     * followed by the class attribute) and cached. The cache is only updated
     * once the complete vector has been built, so a failure while reading the
     * class attribute does not leave a vector without class attribute behind.
     *
     * @return a <code>FastVector</code> attributes, or <code>null</code> if
     *         none were set and there are no instances to derive them from.
     */
    public FastVector getAttributes() {
        if (this.attributes == null) {
            Instances instances = this.getInstances();
            if (instances != null) {
                /** Weka doesn't have a simple method for returning this! */
                FastVector collected = new FastVector();
                for (Enumeration<?> e = instances.enumerateAttributes();
                        e.hasMoreElements();) {
                    collected.addElement((Attribute) e.nextElement());
                }
                /** enumerateAttributes conveniently leaves out the classAttribute! */
                collected.addElement(instances.classAttribute());
                this.attributes = collected;
            }
        }
        return this.attributes;
    }
    
    /**
     * Set the attributes, replacing any previously set or derived vector.
     *
     * <p>The vector is stored as given (no copy is made and nothing is
     * validated).
     *
     * @param attributes the attributes to use; {@link #getAttributes()} reads
     *                   the class attribute from this vector's last element
     */
    public void setAttributes(FastVector attributes) {
        // DEBUG: should check for labeled data
        this.attributes = attributes;
    }
    
    /** ABSTRACT METHODS **/
    
    /**
     * Set the instances.
     * 
     * <p>This should <b>not</b> affect a previously built classifier. Also, this is <b>not</b>
     * guaranteed to set the class index ({@link #setClassIndex(java.lang.Integer)}).
     *
     * @param instances the data set to use
     */
    abstract public void setInstances(Instances instances) ;

    /**
     * Return the <code>Instances</code>.
     *
     * @return the data set; {@link #getAttributes()} treats <code>null</code>
     *         as "no instances available"
     */
    abstract public Instances getInstances() ;

    /**
     * Return a <code>LinkedList</code> of the n-mers (k-mers/n-grams) used as features.
     *
     * <p>{@link #getInstanceSparse} uses the position of an n-mer in this list
     * as its attribute index.
     *
     * @return the n-mers in attribute order
     */
    abstract public LinkedList<String> getNmers() ;

    /**
     * Set the n-mers (k-mers/n-grams) used as features (in the same order that they are used in the {@link Instances}).
     *
     * @param nmers the n-mers in attribute order
     */
    abstract public void setNmers(LinkedList<String> nmers);

    /**
     * Build a classifier using the set <code>Instances</code>.
     *
     * @throws java.lang.Exception
     */
    abstract public void buildClassifier() throws Exception ;

    /**
     * Set the underlying <tt>Weka</tt> <code>Classifier</code>.
     *
     * @param classifer the classifier to use
     */
    abstract public void setClassifier(Classifier classifer) ;

    /**
     * Return the underlying <tt>Weka</tt> <code>Classifier</code>.
     *
     * @return the classifier
     */
    abstract public Classifier getClassifier() ;

    /**
     * Return the human readable name of the classifier.
     *
     * @return the classifier name
     */
    abstract public String getClassiferName() ;

    /**
     * Return the beginning composition vector window size.
     *
     * <p>This value is stored with the model by {@link #saveModelObject(String)}.
     *
     * @return the beginning window size
     */
    abstract public Integer getBegin() ;

    /**
     * Set the beginning composition vector window size.
     *
     * @param begin the beginning window size
     */
    abstract public void setBegin(Integer begin) ;

    /**
     * Return the ending composition vector window size.
     *
     * <p>This value is stored with the model by {@link #saveModelObject(String)}.
     *
     * @return the ending window size
     */
    abstract public Integer getEnd() ;

    /**
     * Set the ending composition vector window size.
     *
     * @param end the ending window size
     */
    abstract public void setEnd(Integer end) ;

    /**
     * Set the class index (the index of the attribute to use as the class
     * for classification).
     *
     * @param idx index of the class attribute
     */
    abstract public void setClassIndex(Integer idx);

    /**
     * Get the class index (the index of the attribute to use as the class
     * for classification).
     *
     * <p>{@link #getInstanceSparse} looks up the class attribute at this
     * position in {@link #getAttributes()}.
     *
     * @return index of the class attribute
     */
    abstract public Integer getClassIndex();
    
    /**
     * Internal class representing a labeled instance
     */    
    public class LabeledInstance {
        public double[] clsDist;
        public Instance inst;
        public String clsString;
        
        LabeledInstance(Instance inst) {
            this.inst = inst;
        }
    }
    
}
